Business Question
The CEO of a Health Insurance company wants to expand the business by offering Car Insurance. To do this, he asked his employees to call some customers and ask if they would buy Car insurance if the company offered it. Through this survey, they obtained a large database containing customer characteristics and their survey responses.
Now, the company is ready to launch the new service, and the sales team has a list of 127,000 customers to call and offer the new insurance. The company only has the resources to call 20,000 customers, so the CEO hired a Data Scientist to study those customers and identify the 20,000 with the best chances of buying the insurance.
Challenge
Build a model that scores each customer on the list by their propensity to take out auto insurance. With this solution, the sales team can prioritize the people with the greatest interest in the new product and optimize the campaign by contacting only the customers most likely to make the purchase.
import optuna
import pickle
import psycopg2
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import scikitplot as skplt
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy import stats
from sqlalchemy.sql import text
from IPython.display import HTML
from sqlalchemy import create_engine
from dataprep.eda import create_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings('ignore')
def jupyter_settings():
    """Apply notebook-wide display defaults.

    Sets inline plotting, the 'bmh' matplotlib style, large figure/font
    sizes, a full-width notebook container, and the seaborn default theme.
    Intended to be called once at the top of the notebook.
    """
    %matplotlib inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    # Stretch the notebook cells to the full browser width.
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
def cramer_v(x, y):
    """Bias-corrected Cramér's V association between two categorical series.

    Builds the contingency table of `x` vs `y`, computes the chi-square
    statistic, and applies the Bergsma-Wicher bias correction to both the
    statistic and the effective table dimensions, so values are comparable
    across tables of different sizes. Returns a float in [0, 1].
    """
    table = pd.crosstab(x, y).values
    total = table.sum()
    n_rows, n_cols = table.shape
    chi2_stat = stats.chi2_contingency(table)[0]
    # Bias-corrected chi-square and corrected row/column counts.
    phi2_corr = max(0, chi2_stat - (n_cols - 1) * (n_rows - 1) / (total - 1))
    cols_corr = n_cols - (n_cols - 1) ** 2 / (total - 1)
    rows_corr = n_rows - (n_rows - 1) ** 2 / (total - 1)
    return np.sqrt((phi2_corr / total) / min(cols_corr - 1, rows_corr - 1))
def cross_validation(X, y, model, model_name="Model", test_size=0.3, cv=10, top_k=2000, verbose=True):
    """Estimate model performance over `cv` repeated random train/test splits.

    Each round re-splits the raw data, runs `preprocessing` (so encodings are
    re-fit on each training split), refits `model`, and collects the four
    scores returned by `print_scores`. Returns a one-row DataFrame with
    'mean +/- std' strings per metric.

    NOTE(review): rounds use independent random splits rather than disjoint
    k-fold partitions, so test sets may overlap between rounds.
    """
    collected = []
    for _ in range(cv):
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size)
        X_tr, X_te, y_tr, y_te = preprocessing(X_tr, X_te, y_tr, y_te)
        model = model.fit(X_tr, y_tr)
        collected.append(print_scores(y_test=y_te,
                                      y_pred=model.predict(X_te),
                                      predict_proba=model.predict_proba(X_te),
                                      top_k=top_k,
                                      verbose=verbose))
    # One row per metric after transposing: [precision@k, recall@k, f1, auc].
    collected = np.array(collected).transpose()

    def fmt(metric_row):
        # Same 'mean +/- std' string format as the original report.
        return np.round(np.mean(metric_row), 2).astype(str) + ' +/- ' + np.round(np.std(metric_row), 4).astype(str)

    return pd.DataFrame({'Model Name': model_name,
                         'Precision Top-K CV': fmt(collected[0]),
                         'Recall Top-K CV': fmt(collected[1]),
                         'F1 Score CV': fmt(collected[2]),
                         'AUC Score CV': fmt(collected[3])}, index=[0])
def plot_curves(y_test, predict_proba, verbose):
    """Compute the ROC-AUC score and, when `verbose`, plot the cumulative-gain curve.

    `predict_proba` is the (n_samples, 2) class-probability matrix; column 1
    holds the positive-class probabilities. Returns the AUC rounded to 3
    decimal places.
    """
    # ROC points are computed but currently not plotted (gain curve is shown instead).
    fpr, tpr, _ = roc_curve(y_test.values, predict_proba[:, 1])
    auc_score = np.round(roc_auc_score(y_test, predict_proba[:, 1]), 3)
    if verbose:
        plt.figure(figsize=(4, 4))
        skplt.metrics.plot_cumulative_gain(y_test, predict_proba, figsize=(4, 4))
    return auc_score
def score_top_k(y_test, predict_proba, top_k):
    """Precision and recall restricted to the `top_k` highest-scored customers.

    Ranks samples by positive-class probability (column 1 of `predict_proba`)
    and counts how many of the first `top_k` ranked samples are true positives.

    Args:
        y_test: pd.Series of true binary labels (0/1).
        predict_proba: (n_samples, 2) array of class probabilities.
        top_k: number of top-ranked customers the sales team can contact.

    Returns:
        dict with 'precision' (hits / top_k) and 'recall' (hits / total positives).
    """
    ranked = pd.DataFrame({'predictions': predict_proba[:, 1],
                           'y_test': y_test.values})
    ranked = ranked.sort_values('predictions', ascending=False).reset_index(drop=True)
    # Bug fix: the top-k rows are positions 0..top_k-1. The original used
    # `index <= top_k`, counting top_k + 1 rows while dividing by top_k.
    # Vectorized slice also replaces the O(n) row-wise apply.
    hits = ranked['y_test'].iloc[:top_k].sum()
    precision = hits / top_k
    recall = hits / ranked['y_test'].sum()
    return {'precision': precision, 'recall': recall}
def print_scores(y_test, y_pred, predict_proba, top_k, verbose):
    """Collect the evaluation metrics for one fitted classifier.

    Computes precision/recall/F1 at the default threshold, the AUC (via
    `plot_curves`, which also draws the gain curve when `verbose`), and the
    top-k precision/recall (via `score_top_k`). Returns
    [precision_top_k, recall_top_k, f1, auc]; the threshold precision and
    recall are computed but not returned.
    """
    precision = np.round(precision_score(y_test, y_pred, pos_label=1), 3)
    recall = np.round(recall_score(y_test, y_pred, pos_label=1), 3)
    f1 = np.round(f1_score(y_test, y_pred, pos_label=1), 3)
    auc = plot_curves(y_test, predict_proba, verbose)
    top_k_scores = score_top_k(y_test, predict_proba, top_k)
    precision_top_k = top_k_scores['precision']
    recall_top_k = top_k_scores['recall']
    if verbose:
        print("Precision top K: ", precision_top_k, " | Recall Top K: ", recall_top_k, f1, " | AUC Score: ", auc)
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    return [precision_top_k, recall_top_k, f1, auc]
def preprocessing(X_train, X_test, y_train, y_test):
    """Encode the categorical features and select the modelling columns.

    All encodings are fitted on the training split only and then applied to
    the test split (categories unseen in training map to 0), so no test
    information leaks into the encoders.

    Args:
        X_train, X_test: feature DataFrames with the raw survey columns.
        y_train, y_test: binary response labels aligned with the frames.

    Returns:
        (X_train, X_test, y_train, y_test) with X_* reduced to the selected
        feature columns. Note: the input frames are modified in place.
    """
    # Binary encodings.
    X_train['gender'] = X_train['gender'].apply(lambda x: 0 if x == 'Male' else 1)
    X_test['gender'] = X_test['gender'].apply(lambda x: 0 if x == 'Male' else 1)
    X_train['vehicle_damage'] = X_train['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
    X_test['vehicle_damage'] = X_test['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
    # Ordinal encoding for vehicle age ('> 2 Years' and any other value -> 2).
    X_train['vehicle_age'] = X_train['vehicle_age'].apply(lambda x: 0 if x == '< 1 Year' else 1 if x == '1-2 Year' else 2)
    X_test['vehicle_age'] = X_test['vehicle_age'].apply(lambda x: 0 if x == '< 1 Year' else 1 if x == '1-2 Year' else 2)
    # region_code: target encoding (mean response per region) fitted on train.
    aux = X_train.copy()
    aux['response'] = y_train
    target_encode_region_code = aux.groupby('region_code')['response'].mean()
    X_train['region_code'] = X_train['region_code'].map(target_encode_region_code)
    X_test['region_code'] = X_test['region_code'].map(target_encode_region_code).fillna(0)
    # policy_sales_channel: frequency encoding fitted on train.
    # Bug fix: the original divided by len(insurance_data5), a notebook-level
    # global (NameError outside the notebook, and the wrong denominator);
    # frequencies must be relative to the training split itself.
    target_encode_policy_sales = aux.groupby('policy_sales_channel')['response'].count() / len(X_train)
    X_train['policy_sales_channel'] = X_train['policy_sales_channel'].map(target_encode_policy_sales)
    X_test['policy_sales_channel'] = X_test['policy_sales_channel'].map(target_encode_policy_sales).fillna(0)
    # Feature selection: keep only the columns used by the models.
    selected = ['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']
    X_train = X_train[selected]
    X_test = X_test[selected]
    return X_train, X_test, y_train, y_test
# Apply the notebook-wide display settings defined above.
jupyter_settings()
# Original database extraction path, kept for reference: the same dataset can
# be pulled from PostgreSQL by joining the users / insurance / vehicle tables.
# # Engine to connect database
# engine = create_engine('postgresql+psycopg2://user:password@hostname/database_name')
# conn = engine.connect()
# query = '''
# SELECT
#     pu.id,
#     gender,
#     age,
#     region_code,
#     policy_sales_channel,
#     previously_insured,
#     annual_premium,
#     vintage,
#     response,
#     driving_license,
#     vehicle_age,
#     vehicle_damage
# FROM
#     pa004.users pu
#     LEFT JOIN pa004.insurance pi ON (pi.id = pu.id)
#     LEFT JOIN pa004.vehicle pv ON (pi.id = pv.id);
# '''
# with conn.execution_options(autocommit=True) as conn:
#     query = conn.execute(text(query))
#     df = pd.DataFrame(query.fetchall())
# df.columns = ['id','gender','age','region_code','policy_sales_channel','previously_insured',
#               'annual_premium','vintage','response','driving_license','vehicle_age','vehicle_damage']
# conn.close()
# Load the survey data from the local CSV extract instead of the database.
df = pd.read_csv('../data/insurance_data.csv')
df.head()
| id | gender | age | region_code | policy_sales_channel | previously_insured | annual_premium | vintage | response | driving_license | vehicle_age | vehicle_damage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7 | Male | 23 | 11.0 | 152.0 | 0 | 23367.0 | 249 | 0 | 1 | < 1 Year | Yes |
| 1 | 13 | Female | 41 | 15.0 | 14.0 | 1 | 31409.0 | 221 | 0 | 1 | 1-2 Year | No |
| 2 | 18 | Female | 25 | 35.0 | 152.0 | 1 | 46622.0 | 299 | 0 | 1 | < 1 Year | No |
| 3 | 31 | Female | 26 | 8.0 | 160.0 | 0 | 2630.0 | 136 | 0 | 1 | < 1 Year | No |
| 4 | 39 | Male | 45 | 8.0 | 124.0 | 0 | 42297.0 | 264 | 0 | 1 | 1-2 Year | Yes |
insurance_data1 = df.copy()
insurance_data1.shape
(381109, 12)
insurance_data1.dtypes
id int64 gender object age int64 region_code float64 policy_sales_channel float64 previously_insured int64 annual_premium float64 vintage int64 response int64 driving_license int64 vehicle_age object vehicle_damage object dtype: object
insurance_data1.isna().sum()
id 0 gender 0 age 0 region_code 0 policy_sales_channel 0 previously_insured 0 annual_premium 0 vintage 0 response 0 driving_license 0 vehicle_age 0 vehicle_damage 0 dtype: int64
# Descriptive statistics for the numerical attributes, extended with
# skewness and kurtosis rows (labelled via fillna on the unnamed 'index' cell).
numerical_columns = ['annual_premium','policy_sales_channel', 'vintage','age']
data_statistics = insurance_data1[numerical_columns].describe().reset_index()
skew = pd.DataFrame(insurance_data1[numerical_columns].apply(lambda x: x.skew())).T
data_statistics = pd.concat([data_statistics, skew]).fillna('Skew')
kurtosis = pd.DataFrame(insurance_data1[numerical_columns].apply(lambda x: x.kurtosis())).T
data_statistics = pd.concat([data_statistics, kurtosis]).fillna('Kurtosis')
# Bug fix: set_index returns a new DataFrame; the original call discarded
# the result, so the 'index' column was never actually set as the index.
data_statistics = data_statistics.set_index('index')
data_statistics
| index | annual_premium | policy_sales_channel | vintage | age | |
|---|---|---|---|---|---|
| 0 | count | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 |
| 1 | mean | 30564.389581 | 112.034295 | 154.347397 | 38.822584 |
| 2 | std | 17213.155057 | 54.203995 | 83.671304 | 15.511611 |
| 3 | min | 2630.000000 | 1.000000 | 10.000000 | 20.000000 |
| 4 | 25% | 24405.000000 | 29.000000 | 82.000000 | 25.000000 |
| 5 | 50% | 31669.000000 | 133.000000 | 154.000000 | 36.000000 |
| 6 | 75% | 39400.000000 | 152.000000 | 227.000000 | 49.000000 |
| 7 | max | 540165.000000 | 163.000000 | 299.000000 | 85.000000 |
| 0 | Skew | 1.766087 | -0.900008 | 0.003030 | 0.672539 |
| 0 | Kurtosis | 34.004569 | -0.970810 | -1.200688 | -0.565655 |
# Value-count bar charts for each binary/categorical attribute, as a 2x3 grid.
fig, axis = plt.subplots(figsize = (19, 12))
plt.subplot(2, 3, 1)
sns.barplot(x = insurance_data1['previously_insured'].value_counts().index.values,
            y = insurance_data1['previously_insured'].value_counts().values)
plt.title('previously_insured')
plt.subplot(2, 3, 2)
sns.barplot(x = insurance_data1['response'].value_counts().index.values,
            y = insurance_data1['response'].value_counts().values)
plt.title('Response')
plt.subplot(2, 3, 3)
sns.barplot(x = insurance_data1['gender'].value_counts().index.values,
            y = insurance_data1['gender'].value_counts().values)
plt.title('Gender')
plt.subplot(2, 3, 4)
sns.barplot(x = insurance_data1['vehicle_age'].value_counts().index.values,
            y = insurance_data1['vehicle_age'].value_counts().values)
plt.title('vehicle_age')
plt.subplot(2, 3, 5)
sns.barplot(x = insurance_data1['vehicle_damage'].value_counts().index.values,
            y = insurance_data1['vehicle_damage'].value_counts().values)
plt.title('vehicle_damage')
plt.subplot(2, 3, 6)
sns.barplot(x = insurance_data1['driving_license'].value_counts().index.values,
            y = insurance_data1['driving_license'].value_counts().values)
plt.title('driving_license')
Text(0.5, 1.0, 'driving_license')
insurance_data1['previously_insured'].value_counts().values
array([206481, 174628])
insurance_data2 = insurance_data1.copy()
# insurance_data2['important_sales_channel'] = insurance_data2['policy_sales_channel'].apply(lambda x: x if x in [26.0, 124.0, 152.0,160.0] else 0)
This feature didn't have a relevant impact on the result, so I decided not to use it.
insurance_data3 = insurance_data2.copy()
Nothing to filter
insurance_data4 = insurance_data3.copy()
report = create_report(insurance_data4).show()
0%| …
| Number of Variables | 12 |
|---|---|
| Number of Rows | 381109 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 93.9 MB |
| Average Row Size in Memory | 258.5 B |
| Variable Types |
|
| id is uniformly distributed | Uniform |
|---|---|
| age is skewed | Skewed |
| region_code is skewed | Skewed |
| policy_sales_channel is skewed | Skewed |
| annual_premium is skewed | Skewed |
| previously_insured has constant length 1 | Constant Length |
| response has constant length 1 | Constant Length |
| driving_license has constant length 1 | Constant Length |
numerical
| Approximate Distinct Count | 381109 |
|---|---|
| Approximate Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 6097744 |
| Mean | 190555 |
| Minimum | 1 |
| Maximum | 381109 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1 |
|---|---|
| 5-th Percentile | 19056.4 |
| Q1 | 95278 |
| Median | 190555 |
| Q3 | 285832 |
| 95-th Percentile | 362053.6 |
| Maximum | 381109 |
| Range | 381108 |
| IQR | 190554 |
| Mean | 190555 |
|---|---|
| Standard Deviation | 110016.8362 |
| Variance | 1.2104e+10 |
| Sum | 7.2622e+10 |
| Skewness | -8.0711e-18 |
| Kurtosis | -1.2 |
| Coefficient of Variation | 0.5773 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 26646561 |
| Mean | 4.9185 |
|---|---|
| Standard Deviation | 0.9967 |
| Median | 4 |
| Minimum | 4 |
| Maximum | 6 |
| 1st row | Male |
|---|---|
| 2nd row | Female |
| 3rd row | Female |
| 4th row | Female |
| 5th row | Male |
| Count | 1874476 |
|---|---|
| Lowercase Letter | 1493367 |
| Space Separator | 0 |
| Uppercase Letter | 381109 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
numerical
| Approximate Distinct Count | 66 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 6097744 |
| Mean | 38.8226 |
| Minimum | 20 |
| Maximum | 85 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 20 |
|---|---|
| 5-th Percentile | 21 |
| Q1 | 25 |
| Median | 36 |
| Q3 | 49 |
| 95-th Percentile | 69 |
| Maximum | 85 |
| Range | 65 |
| IQR | 24 |
| Mean | 38.8226 |
|---|---|
| Standard Deviation | 15.5116 |
| Variance | 240.6101 |
| Sum | 1.4796e+07 |
| Skewness | 0.6725 |
| Kurtosis | -0.5657 |
| Coefficient of Variation | 0.3996 |
numerical
| Approximate Distinct Count | 53 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 6097744 |
| Mean | 26.3888 |
| Minimum | 0 |
| Maximum | 52 |
| Zeros | 2021 |
| Zeros (%) | 0.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 5 |
| Q1 | 15 |
| Median | 28 |
| Q3 | 35 |
| 95-th Percentile | 47 |
| Maximum | 52 |
| Range | 52 |
| IQR | 20 |
| Mean | 26.3888 |
|---|---|
| Standard Deviation | 13.2299 |
| Variance | 175.0299 |
| Sum | 1.0057e+07 |
| Skewness | -0.1153 |
| Kurtosis | -0.8679 |
| Coefficient of Variation | 0.5013 |
numerical
| Approximate Distinct Count | 155 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 6097744 |
| Mean | 112.0343 |
| Minimum | 1 |
| Maximum | 163 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1 |
|---|---|
| 5-th Percentile | 26 |
| Q1 | 29 |
| Median | 133 |
| Q3 | 152 |
| 95-th Percentile | 160 |
| Maximum | 163 |
| Range | 162 |
| IQR | 123 |
| Mean | 112.0343 |
|---|---|
| Standard Deviation | 54.204 |
| Variance | 2938.073 |
| Sum | 4.2697e+07 |
| Skewness | -0.9 |
| Kurtosis | -0.9708 |
| Coefficient of Variation | 0.4838 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 25153194 |
| Mean | 1 |
|---|---|
| Standard Deviation | 0 |
| Median | 1 |
| Minimum | 1 |
| Maximum | 1 |
| 1st row | 0 |
|---|---|
| 2nd row | 1 |
| 3rd row | 1 |
| 4th row | 0 |
| 5th row | 0 |
| Count | 0 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 0 |
| Dash Punctuation | 0 |
| Decimal Number | 381109 |
numerical
| Approximate Distinct Count | 48838 |
|---|---|
| Approximate Unique (%) | 12.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 6097744 |
| Mean | 30564.3896 |
| Minimum | 2630 |
| Maximum | 540165 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 2630 |
|---|---|
| 5-th Percentile | 2630 |
| Q1 | 24405 |
| Median | 31669 |
| Q3 | 39400 |
| 95-th Percentile | 55176 |
| Maximum | 540165 |
| Range | 537535 |
| IQR | 14995 |
| Mean | 30564.3896 |
|---|---|
| Standard Deviation | 17213.1551 |
| Variance | 2.9629e+08 |
| Sum | 1.1648e+10 |
| Skewness | 1.7661 |
| Kurtosis | 34.0041 |
| Coefficient of Variation | 0.5632 |
numerical
| Approximate Distinct Count | 290 |
|---|---|
| Approximate Unique (%) | 0.1% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 6097744 |
| Mean | 154.3474 |
| Minimum | 10 |
| Maximum | 299 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 10 |
|---|---|
| 5-th Percentile | 24 |
| Q1 | 82 |
| Median | 154 |
| Q3 | 227 |
| 95-th Percentile | 285 |
| Maximum | 299 |
| Range | 289 |
| IQR | 145 |
| Mean | 154.3474 |
|---|---|
| Standard Deviation | 83.6713 |
| Variance | 7000.8871 |
| Sum | 5.8823e+07 |
| Skewness | 0.00303 |
| Kurtosis | -1.2007 |
| Coefficient of Variation | 0.5421 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 25153194 |
| Mean | 1 |
|---|---|
| Standard Deviation | 0 |
| Median | 1 |
| Minimum | 1 |
| Maximum | 1 |
| 1st row | 0 |
|---|---|
| 2nd row | 0 |
| 3rd row | 0 |
| 4th row | 0 |
| 5th row | 0 |
| Count | 0 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 0 |
| Dash Punctuation | 0 |
| Decimal Number | 381109 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 25153194 |
| Mean | 1 |
|---|---|
| Standard Deviation | 0 |
| Median | 1 |
| Minimum | 1 |
| Maximum | 1 |
| 1st row | 1 |
|---|---|
| 2nd row | 1 |
| 3rd row | 1 |
| 4th row | 1 |
| 5th row | 1 |
| Count | 0 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 0 |
| Dash Punctuation | 0 |
| Decimal Number | 381109 |
categorical
| Approximate Distinct Count | 3 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 27836964 |
| Mean | 8.042 |
|---|---|
| Standard Deviation | 0.2006 |
| Median | 8 |
| Minimum | 8 |
| Maximum | 9 |
| 1st row | < 1 Year |
|---|---|
| 2nd row | 1-2 Year |
| 3rd row | < 1 Year |
| 4th row | < 1 Year |
| 5th row | 1-2 Year |
| Count | 1540443 |
|---|---|
| Lowercase Letter | 1159334 |
| Space Separator | 561902 |
| Uppercase Letter | 381109 |
| Dash Punctuation | 200316 |
| Decimal Number | 581425 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 25726716 |
| Mean | 2.5049 |
|---|---|
| Standard Deviation | 0.5 |
| Median | 3 |
| Minimum | 2 |
| Maximum | 3 |
| 1st row | Yes |
|---|---|
| 2nd row | No |
| 3rd row | No |
| 4th row | No |
| 5th row | Yes |
| Count | 954631 |
|---|---|
| Lowercase Letter | 573522 |
| Space Separator | 0 |
| Uppercase Letter | 381109 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
# Remove driving licence (near-constant, see report above).
# Age: distribution split by response.
plt.figure(figsize = (18, 4))
plt.subplot(1, 3, 1)
sns.boxplot(x = 'response', y = 'age', data = insurance_data4)
plt.title('Age x Response')
plt.subplot(1, 3, 2)
sns.histplot(data = insurance_data4['age'][insurance_data4['response'] == 1] )
plt.title('Age x Response = "Yes"')
plt.subplot(1, 3, 3)
plt.title('Age x Response = "No"')
sns.histplot(data = insurance_data4['age'][insurance_data4['response'] == 0] );
# Per-age response share: aux1 is a constant bar of height 1 (the 'No' base),
# aux3 holds counts per (age, response) plus the per-age total, so
# 'percentage' is the share of each response class within the age group.
aux = insurance_data4[['age', 'response']]
aux1 = aux.groupby('age').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['age', 'response'])).reset_index().sort_values('age')
# Column 0 is the unnamed count column produced by value_counts; after the
# merge it appears as '0_x' (per-class count) and '0_y' (per-age total).
aux3 = pd.DataFrame(aux2.groupby('age').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'age')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
# Overlaid bars: full-height base (dark) with positive share on top (light).
plt.figure(figsize = (22, 4 ))
bar1 = sns.barplot(x = 'age', y = 'response', data = aux1, color = 'darkblue', errorbar = None)
bar2 = sns.barplot(x = 'age', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar = None)
# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Age')
# show the graph
plt.show()
# Annual premium: boxplot (capped at 100k for readability) and histograms by response.
plt.figure(figsize = (18, 4))
sns.boxplot(x = 'response', y = 'annual_premium', data = insurance_data4[insurance_data4['annual_premium'] < 100000])
plt.title('annual premium x Response')
plt.subplot(1, 3, 2)
sns.distplot(insurance_data4[insurance_data4['response'] == 1]['annual_premium'], kde = False )
plt.subplot(1, 3, 2)
sns.distplot(insurance_data4[insurance_data4['response'] == 0]['annual_premium'], kde = False )
plt.title('Response: Yes x No')
plt.subplot(1, 3, 3)
# Zoom into the 15k-100k premium band for the overlaid histograms.
aux = insurance_data4[(insurance_data4['annual_premium'] < 100000) & (insurance_data4['annual_premium'] > 15000)]
aux1 = aux[aux['response'] == 1]
sns.histplot(data = aux1['annual_premium'] )
plt.subplot(1, 3, 3)
aux2 = aux[aux['response'] == 0]
sns.histplot(data = aux2['annual_premium'])
plt.title('Response: Yes x No');
# Region code: customer count per region, split by response.
plt.figure(figsize = (6, 4))
aux = insurance_data4[['region_code', 'id', 'response']].groupby(['region_code','response']).count().reset_index()
sns.scatterplot(data = aux, x = 'region_code', y = 'id', hue = 'response')
<AxesSubplot: xlabel='region_code', ylabel='id'>
# Gender: response proportion within each gender (counts normalised by the
# per-gender totals computed via the row-wise apply below).
aux = insurance_data4[['id', 'gender','response']].groupby(['gender','response']).count().reset_index()
aux['total_gender'] = aux.apply(lambda x: sum(aux['id'][aux['gender'] == 'Female']) if x['gender'] == 'Female'
                                else sum(aux['id'][aux['gender'] == 'Male']), axis = 1)
aux['total_gender'] = aux['id'] / aux['total_gender']
plt.figure(figsize = (6, 4))
sns.barplot(data = aux, x = 'gender', y = 'total_gender', hue = 'response')
plt.title('Response proportion by Gender');
# Previously insured: same normalised-proportion chart by insurance status.
aux = insurance_data4[['id', 'previously_insured','response']].groupby(['previously_insured','response']).count().reset_index()
aux['total_previously_insured'] = aux.apply(lambda x: sum(aux['id'][aux['previously_insured'] == 0]) if x['previously_insured'] == 0
                                            else sum(aux['id'][aux['previously_insured'] == 1]), axis = 1)
aux['total_previously_insured'] = aux['id'] / aux['total_previously_insured']
plt.figure(figsize = (6, 4))
sns.barplot(data = aux, x = 'previously_insured', y = 'total_previously_insured', hue = 'response')
plt.title('Response proportion by Previously Insured');
aux
| previously_insured | response | id | total_previously_insured | |
|---|---|---|---|---|
| 0 | 0 | 0 | 159929 | 0.774546 |
| 1 | 0 | 1 | 46552 | 0.225454 |
| 2 | 1 | 0 | 174470 | 0.999095 |
| 3 | 1 | 1 | 158 | 0.000905 |
Possible data leakage? Only 0.09% of previously insured customers respond positively — an almost deterministic relationship. Confirm that `previously_insured` would genuinely be known at prediction time before relying on this feature.
# Vehicle age: per-category response share (same overlaid-bar construction as
# the age chart above: dark base of height 1, light bar = positive share).
aux = insurance_data4[['vehicle_age', 'response']]
aux1 = aux.groupby('vehicle_age').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['vehicle_age', 'response'])).reset_index().sort_values('vehicle_age')
aux3 = pd.DataFrame(aux2.groupby('vehicle_age').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'vehicle_age')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
# Vehicle age (comment corrected; was a copy-paste of "Policy sales channel").
plt.figure(figsize = (6, 4 ))
bar1 = sns.barplot(x = 'vehicle_age', y = 'response', data = aux1, color = 'darkblue', errorbar = None)
bar2 = sns.barplot(x = 'vehicle_age', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar = None)
# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1.25)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Vehicle Age')
# show the graph
plt.show()
# Policy sales channel: per-channel response share.
aux = insurance_data4[['policy_sales_channel', 'response']]
aux1 = aux.groupby('policy_sales_channel').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['policy_sales_channel', 'response'])).reset_index().sort_values('policy_sales_channel')
aux3 = pd.DataFrame(aux2.groupby('policy_sales_channel').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'policy_sales_channel')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
plt.figure(figsize = (22, 4 ))
bar1 = sns.barplot(x = 'policy_sales_channel', y = 'response', data = aux1, color = 'darkblue', errorbar = None)
bar2 = sns.barplot(x = 'policy_sales_channel', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar = None)
# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Policy Sales Channel')
# show the graph
plt.show()
# Vintage: per-value response share.
aux = insurance_data4[['vintage', 'response']]
aux1 = aux.groupby('vintage').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['vintage', 'response'])).reset_index().sort_values('vintage')
aux3 = pd.DataFrame(aux2.groupby('vintage').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'vintage')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
plt.figure(figsize = (22, 4 ))
bar1 = sns.barplot(x = 'vintage', y = 'response', data = aux1, color = 'darkblue', errorbar = None)
bar2 = sns.barplot(x = 'vintage', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar = None)
# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Vintage')
# show the graph
plt.show()
plt.figure(figsize = (5, 4 ))
sns.boxplot(x = 'response', y = 'vintage', data = insurance_data4)
plt.title('Vintage distribution by Response');
The vintage distribution is nearly identical across the two response classes, so this feature carries little discriminative information on its own. Candidate for deletion — but verify against the tree-based feature-importance ranking before dropping it.
# Vehicle damage: per-category response share (same overlaid-bar construction
# as the charts above).
aux = insurance_data4[['vehicle_damage', 'response']]
aux1 = aux.groupby('vehicle_damage').count().reset_index()
aux1['response'] = 1
aux2 = pd.DataFrame(aux.value_counts(['vehicle_damage', 'response'])).reset_index().sort_values('vehicle_damage')
aux3 = pd.DataFrame(aux2.groupby('vehicle_damage').sum()[0]).reset_index()
aux3 = pd.merge(aux2, aux3, how = 'left', on = 'vehicle_damage')
aux3['percentage'] = aux3['0_x'] / aux3['0_y']
plt.figure(figsize = (5, 4 ))
bar1 = sns.barplot(x = 'vehicle_damage', y = 'response', data = aux1, color = 'darkblue', errorbar = None)
bar2 = sns.barplot(x = 'vehicle_damage', y = 'percentage', data = aux3[aux3['response'] == 1], color = 'lightblue', errorbar = None)
# add legend
top_bar = mpatches.Patch(color='darkblue', label='Response = No')
bottom_bar = mpatches.Patch(color='lightblue', label='Response = Yes')
plt.legend(handles=[top_bar, bottom_bar])
plt.ylim(0, 1.25)
plt.xticks(rotation= 90, size = 7)
plt.title('Percentage Responses by Vehicle Damage')
# show the graph
plt.show()
# Bivariate count plots; 'id_y' is the per-channel customer count from the
# merge, used to drop rare channels from the wide charts.
aux = insurance_data4.groupby('policy_sales_channel').count()['id'].reset_index()
aux = pd.merge(insurance_data4, aux, on = 'policy_sales_channel', how = 'left')
plt.figure(figsize = (18,6))
plt.title('Previously insured X Policy Sales Channel')
sns.countplot(data = aux[aux['id_y'] > 500], x = 'policy_sales_channel', hue = 'previously_insured');
plt.figure(figsize = (6,4))
plt.title('Previously insured X Vehicle Damage')
sns.countplot(data = insurance_data4, x = 'previously_insured', hue = 'vehicle_damage');
plt.figure(figsize = (6,4))
plt.title('Vehicle Age X Previously insured')
sns.countplot(data = insurance_data4, x = 'vehicle_age', hue = 'previously_insured');
aux = insurance_data4.groupby('policy_sales_channel').count()['id'].reset_index()
aux = pd.merge(insurance_data4, aux, on = 'policy_sales_channel', how = 'left')
plt.figure(figsize = (18,6))
plt.title('Policy Sales Channel X Vehicle Age')
sns.countplot(data = aux[aux['id_y'] > 1000], x = 'policy_sales_channel', hue = 'vehicle_age');
aux = insurance_data4.groupby('policy_sales_channel').count()['id'].reset_index()
aux = pd.merge(insurance_data4, aux, on = 'policy_sales_channel', how = 'left')
plt.figure(figsize = (18,6))
plt.title('Policy Sales Channel X Vehicle Damage')
sns.countplot(data = aux[aux['id_y'] > 1000], x = 'policy_sales_channel', hue = 'vehicle_damage');
# Continuous variables: Pearson correlation heatmap.
plt.figure(figsize = (6,4))
corr_heatmap = insurance_data4[['age','annual_premium', 'vintage']].corr()
heatmap = sns.heatmap(corr_heatmap, vmin=-1, vmax=1, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap');
# Categorical variables: pairwise bias-corrected Cramér's V matrix.
cat_attributes = ['gender','region_code', 'policy_sales_channel', 'previously_insured', 'response', 'vehicle_age', 'vehicle_damage']
cramer_matrix = []
for i in cat_attributes:
    cramer_list = []
    for j in cat_attributes:
        cramer_list.append(cramer_v(insurance_data4[i], insurance_data4[j]))
    cramer_matrix.append(cramer_list)
cat_corr = pd.DataFrame({'gender': cramer_matrix[0],
                         'region_code': cramer_matrix[1],
                         'policy_sales_channel': cramer_matrix[2],
                         'previously_insured': cramer_matrix[3],
                         'response': cramer_matrix[4],
                         'vehicle_age': cramer_matrix[5],
                         'vehicle_damage': cramer_matrix[6]})
cat_corr = cat_corr.set_index(cat_corr.columns)
plt.figure(figsize=(6, 4) )
plt.title('Correlation Heatmap', fontdict={'fontsize':13}, pad=12);
sns.heatmap(cat_corr, vmin=0, vmax=1, annot=True, cmap='Blues');
insurance_data5 = insurance_data4.copy()
# No missing values, so no imputation step is needed.
# NOTE(review): scalers and encoders below are fitted on the FULL dataset;
# for the model comparison the per-split `preprocessing` function re-fits
# encoders on train only. Confirm this full-data version is used only for
# the feature-importance study, otherwise it leaks target information.
ss = StandardScaler()
# Annual premium: standardised (heavy-tailed, see skew/kurtosis above).
insurance_data5['annual_premium'] = ss.fit_transform(insurance_data5[['annual_premium']].values )
mms_age = MinMaxScaler()
mms_vintage = MinMaxScaler()
# Age: min-max scaled to [0, 1].
insurance_data5['age'] = mms_age.fit_transform(insurance_data5[['age']].values )
# Vintage: min-max scaled to [0, 1].
insurance_data5['vintage'] = mms_vintage.fit_transform(insurance_data5[['vintage']].values )
# Gender: binary encoding (Male -> 0, anything else -> 1).
insurance_data5['gender'] = insurance_data5['gender'].apply(lambda x: 0 if x == 'Male' else 1)
# Vehicle damage: binary encoding (No -> 0, anything else -> 1).
insurance_data5['vehicle_damage'] = insurance_data5['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
# Vehicle age: ordinal encoding (< 1 Year -> 0, 1-2 Year -> 1, otherwise 2).
insurance_data5['vehicle_age'] = insurance_data5['vehicle_age'].apply(lambda x: 0 if x == '< 1 Year' else 1 if x == '1-2 Year' else 2)
# region_code: target encoding (mean response per region).
target_encode_region_code = insurance_data5.groupby('region_code')['response'].mean()
insurance_data5['region_code'] = insurance_data5['region_code'].map( target_encode_region_code )
# policy_sales_channel: frequency encoding (share of rows per channel).
target_encode_policy_sales = insurance_data5.groupby('policy_sales_channel')['response'].count() / len(insurance_data5)
insurance_data5['policy_sales_channel'] = insurance_data5['policy_sales_channel'].map( target_encode_policy_sales )
insurance_data6 = insurance_data5.copy()
# Drop near-constant driving_license and the identifier column.
insurance_data6 = insurance_data6.drop(['driving_license', 'id'], axis = 1)
X_train_n = insurance_data6.drop('response', axis = 1)
y_train_n = insurance_data6['response'].values
# Feature relevance: impurity-based importances from an Extra-Trees ensemble.
extra_trees = ExtraTreesClassifier( n_estimators = 250, n_jobs = -1 )
# Training model
extra_trees.fit( X_train_n, y_train_n )
# Mean impurity-based importance per feature ...
importances = extra_trees.feature_importances_
# ... and its std across the individual trees (error bars for the plot below).
std = np.std( [tree.feature_importances_ for tree in extra_trees.estimators_], axis = 0 )
# Feature indices sorted by decreasing importance.
indices = np.argsort( importances )[::-1]
# Print feature ranking. Build the table in one shot: the original appended
# one-row frames with pd.concat inside a loop, which is quadratic and an
# established pandas anti-pattern.
print('Feature Ranking: ')
df = pd.DataFrame( {'feature': X_train_n.columns, 'importance': importances} )
df = df.sort_values( 'importance', ascending = False )
print( df )
Feature Ranking:
feature importance
0 vintage 0.284470
0 annual_premium 0.252330
0 age 0.152346
0 region_code 0.102225
0 vehicle_damage 0.076358
0 policy_sales_channel 0.057825
0 previously_insured 0.053160
0 vehicle_age 0.016250
0 gender 0.005036
# Bar chart of the impurity-based importances, with inter-tree std as error bars.
n_features = X_train_n.shape[1]
plt.figure(figsize = (6, 4))
plt.title('Feature Importances')
plt.bar(range(n_features), importances[indices], color = "r", yerr = std[indices], align = "center")
plt.xticks(range(n_features), df['feature'].values, rotation = 90)
plt.xlim([-1, n_features])
plt.show()
# Drop the two least relevant features before modelling.
insurance_data6 = insurance_data6.drop( ['gender', 'vehicle_age'], axis = 1 )
insurance_data7 = insurance_data6.copy()
# Baseline model: Logistic Regression with inverse class-frequency weights.
y = insurance_data4['response']
X = insurance_data4.drop('response', axis = 1)
# value_counts(normalize=True) sorts by frequency, so index 0 holds the
# majority-class share; swapping the two gives each class the other's share.
weights = insurance_data5['response'].value_counts(normalize = True).values
weight = {0: weights[1], 1: weights[0]}
lr_model = LogisticRegression( class_weight = weight, random_state = 42 )
lr_scores = cross_validation(model = lr_model,
                             model_name = 'Logistic Regression',
                             X = X,
                             y = y,
                             test_size = 0.3,
                             cv = 10,
                             verbose = False)
lr_scores
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.12 +/- 0.024 | 0.02 +/- 0.0034 | 0.35 +/- 0.0733 | 0.72 +/- 0.0947 |
# Single hold-out run (cv = 1) with verbose output for the confusion matrix.
cross_validation(model = lr_model,
                 model_name = 'Logistic Regression',
                 X = X,
                 y = y,
                 test_size = 0.3,
                 cv = 1,
                 verbose = True)
Precision top K: 0.142 | Recall Top K: 0.02006783493499152 0.4 | AUC Score: 0.783 Confusion Matrix: [[59086 41095] [ 331 13821]]
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.14 +/- 0.0 | 0.02 +/- 0.0 | 0.4 +/- 0.0 | 0.78 +/- 0.0 |
<Figure size 400x400 with 0 Axes>
# Distance-weighted K-Nearest Neighbors baseline (k = 5).
knn_model = KNeighborsClassifier( n_neighbors = 5, weights = 'distance' )
knn_scores = cross_validation(model = knn_model,
                              model_name = 'K-Nearest Neighbors',
                              X = X,
                              y = y,
                              test_size = 0.3,
                              cv = 10,
                              verbose = False)
knn_scores
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | K-Nearest Neighbors | 0.24 +/- 0.0071 | 0.03 +/- 0.001 | 0.12 +/- 0.0028 | 0.6 +/- 0.0024 |
# Re-check KNN with k = 3 on a single verbose hold-out run.
knn_model = KNeighborsClassifier( n_neighbors = 3, weights = 'distance' )
cross_validation(model = knn_model,
                 model_name = 'K-Nearest Neighbors',
                 X = X,
                 y = y,
                 test_size = 0.3,
                 cv = 1,
                 verbose = True)
Precision top K: 0.229 | Recall Top K: 0.03282683486238532 0.15 | AUC Score: 0.577 Confusion Matrix: [[93808 6573] [12291 1661]]
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | K-Nearest Neighbors | 0.23 +/- 0.0 | 0.03 +/- 0.0 | 0.15 +/- 0.0 | 0.58 +/- 0.0 |
<Figure size 400x400 with 0 Axes>
# Random Forest using the same inverse class weights as the baseline.
rf_model = RandomForestClassifier(n_estimators = 100, max_depth = 6, class_weight = weight, random_state = 42)
rf_scores = cross_validation(model = rf_model,
                             model_name = 'Random Forest',
                             X = X,
                             y = y,
                             test_size = 0.3,
                             cv = 10,
                             verbose = False)
rf_scores
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | Random Forest | 0.39 +/- 0.0138 | 0.06 +/- 0.0017 | 0.42 +/- 0.0024 | 0.85 +/- 0.0013 |
# Single verbose hold-out run for the Random Forest.
cross_validation(model = rf_model,
                 model_name = 'Random Forest',
                 X = X,
                 y = y,
                 test_size = 0.3,
                 cv = 1,
                 verbose = True)
Precision top K: 0.3935 | Recall Top K: 0.0558195616710405 0.423 | AUC Score: 0.848 Confusion Matrix: [[64992 35242] [ 857 13242]]
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | Random Forest | 0.39 +/- 0.0 | 0.06 +/- 0.0 | 0.42 +/- 0.0 | 0.85 +/- 0.0 |
<Figure size 400x400 with 0 Axes>
# XGBoost; scale_pos_weight = 8 up-weights the positive (minority) class —
# presumably chosen near the negative/positive ratio, confirm against the data.
xgb_model = XGBClassifier(n_estimators = 100, max_depth = 6, scale_pos_weight = 8, random_state = 42)
xgb_scores = cross_validation(model = xgb_model,
                              model_name = 'XGBoost',
                              X = X,
                              y = y,
                              test_size = 0.3,
                              cv = 10,
                              verbose = False)
xgb_scores
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | XGBoost | 0.43 +/- 0.0102 | 0.06 +/- 0.0014 | 0.43 +/- 0.0019 | 0.85 +/- 0.0009 |
# Fresh (unfitted) XGBoost instance for the single verbose hold-out run.
xgb_model = XGBClassifier(n_estimators = 100, max_depth = 6, scale_pos_weight = 8, random_state = 42)
cross_validation(model = xgb_model,
                 model_name = 'XGBoost',
                 X = X,
                 y = y,
                 test_size = 0.3,
                 cv = 1,
                 verbose = True)
Precision top K: 0.4135 | Recall Top K: 0.058911525858384384 0.432 | AUC Score: 0.852 Confusion Matrix: [[67817 32478] [ 1211 12827]]
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | XGBoost | 0.41 +/- 0.0 | 0.06 +/- 0.0 | 0.43 +/- 0.0 | 0.85 +/- 0.0 |
<Figure size 400x400 with 0 Axes>
# Consolidated cross-validated comparison of all candidate models.
models = ['Logistic Regression', 'K-Nearest Neighbors', 'Random Forest', 'XGBoost']
models_performance = pd.concat([lr_scores, knn_scores, rf_scores, xgb_scores])
models_performance['Model'] = models
# The original chained set_index('Model') before set_index('Model Name');
# the first index was immediately discarded, so sort and index in one pass.
models_performance.sort_values('Precision Top-K CV', ascending = False).set_index('Model Name')
| Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|
| Model Name | ||||
| XGBoost | 0.43 +/- 0.0102 | 0.06 +/- 0.0014 | 0.43 +/- 0.0019 | 0.85 +/- 0.0009 |
| Random Forest | 0.39 +/- 0.0138 | 0.06 +/- 0.0017 | 0.42 +/- 0.0024 | 0.85 +/- 0.0013 |
| K-Nearest Neighbors | 0.24 +/- 0.0071 | 0.03 +/- 0.001 | 0.12 +/- 0.0028 | 0.6 +/- 0.0024 |
| Logistic Regression | 0.12 +/- 0.024 | 0.02 +/- 0.0034 | 0.35 +/- 0.0733 | 0.72 +/- 0.0947 |
XGBoost: similar results compared to the Random Forest model, but about 2x faster to train.
# Train test split
# NOTE(review): no random_state is set, so the split (and every number
# reported downstream) changes between runs — consider pinning it.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3)
# Preprocess data with the notebook's preprocessing helper (defined earlier
# in the file; presumably applies the scaling/encoding steps — confirm).
X_train, X_test, y_train, y_test = preprocessing(X_train, X_test, y_train, y_test)
def objective(trial):
    """Optuna objective: maximize precision@2000 of an XGBoost classifier.

    Samples one hyper-parameter configuration, fits on the (module-level)
    training split and returns top-k precision on the hold-out split.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        # suggest_loguniform is deprecated and removed in Optuna 3.x;
        # suggest_float(..., log=True) is the supported equivalent.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 6, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'eval_metric': 'map',  # mean average precision suits the ranking goal
        'use_label_encoder': False
    }
    # Fit the candidate model on the training split.
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train, y_train)
    # Class probabilities on the hold-out split.
    y_pred = optuna_model.predict_proba(X_test)
    # Business metric: precision among the 2000 highest-scored customers.
    precision_top_k = score_top_k(y_test, y_pred, 2000)['precision']
    return precision_top_k
# Hyper-parameter search (run once; left commented out so reruns are fast).
# study = optuna.create_study(direction='maximize')
# #study.optimize(objective, n_trials = 100 )
# print('Number of finished trials: {}'.format(len(study.trials)))
# print('Best trial:')
# trial = study.best_trial
# print(' Value: {}'.format(trial.value))
# print(' Params: ')
# for key, value in trial.params.items():
#     print(' {}: {}'.format(key, value))
# Parameters set with best score
#params = trial.params
# Best trial's hyper-parameters, hard-coded from the Optuna study above.
params = {'max_depth': 8,
          'learning_rate': 0.014416504517463456,
          'n_estimators': 499,
          'min_child_weight': 10,
          'scale_pos_weight': 10,
          'gamma': 0.5834982442488319,
          'subsample': 0.3703846362615623,
          'colsample_bytree': 0.6808135028844782,
          'reg_alpha': 1.001572471362163e-05,
          'reg_lambda': 1.1690461872764534e-05}
model = XGBClassifier(**params)
# Cross-validated scores of the tuned model (same protocol as the baselines).
scores = cross_validation(X = X,
                          y = y,
                          model = model,
                          model_name = 'XGBoost',
                          test_size = 0.3,
                          cv = 10,
                          verbose = False)
scores
| Model Name | Precision Top-K CV | Recall Top-K CV | F1 Score CV | AUC Score CV | |
|---|---|---|---|---|---|
| 0 | XGBoost | 0.43 +/- 0.0092 | 0.06 +/- 0.0014 | 0.42 +/- 0.0022 | 0.85 +/- 0.0008 |
# Working copy for the business-performance section below.
insurance_data9 = insurance_data7.copy()
It shows the relationship between the fraction of customers contacted, when the list is ordered by the model's score, and the fraction of interested customers reached. The closer the cumulative-gains curve is to the top-left corner of the chart, the greater the gain: a higher proportion of the responders is reached for a lower proportion of customers contacted.
# Cumulative gains: share of interested customers reached vs share contacted.
skplt.metrics.plot_cumulative_gain(y_test, model.predict_proba(X_test), figsize = (6,6));
The optimum point is about 30% of the candidates, so we can reach 80% of the customers interested in the insurance.
It shows how good the model is compared to a random one, according to the number of customers contacted. For example, Lift = 2 means that the model is twice as good as random selection.
# Lift curve: model performance relative to random calling, by share contacted.
skplt.metrics.plot_lift_curve(y_test, model.predict_proba(X_test), figsize = (6,6));
If we call about 30% of the available customers, the model will be about 2.7 times better than calling randomly.
# Business translation: income and recall at several call-list sizes,
# comparing the model-ranked list against random calling.
predict_proba = model.predict_proba(X_test)
total_customers = 127000.0
call_cost = 2.0        # assumed cost per phone call ($) — TODO confirm with business
avg_ticket = 5280.0    # assumed yearly premium per sale ($) — TODO confirm with business
response_rate = 0.123  # overall share of interested customers (= random-call precision)
percentage_customers = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
# Top-k precision/recall of the model at each list size (one loop instead of
# the original six near-identical hand-written entries per list).
precision = []
recall = []
for pct in percentage_customers:
    top_k = score_top_k(y_test, predict_proba, total_customers * pct)
    precision.append(float(top_k['precision']))
    recall.append(float(top_k['recall']))
no_customers = [pct * total_customers for pct in percentage_customers]
investment = [n * call_cost for n in no_customers]
# Random calling reaches interested customers in proportion to calls made.
recall_rand = [n / total_customers for n in no_customers]
roi_model = [avg_ticket * pct * total_customers * p for pct, p in zip(percentage_customers, precision)]
roi_rand = [avg_ticket * pct * total_customers * response_rate for pct in percentage_customers]
df_info = pd.DataFrame([np.round(percentage_customers, 2),
                        np.round(no_customers, 0),
                        np.round(investment, 2),
                        np.round(np.round(roi_rand, 2) / 1000000, 2),
                        np.round(recall_rand, 2),
                        np.round(np.round(roi_model, 2) / 1000000, 2),
                        np.round(recall, 2)]).T
# One consistent income column name: the original named the column
# 'ML Model Income($)' but formatted into a NEW 'ML Model Income ($)'
# column, leaving a duplicate unformatted column in the table.
df_info.columns = ['percentage customers', 'No. Customers', 'Investment ($)',
                   'Random Calls Income ($)', 'Random Model Recall', 'ML Model Income ($)', 'ML Model Recall']
as_millions = lambda x: "$ " + str(x) + "M"
df_info['Random Calls Income ($)'] = df_info['Random Calls Income ($)'].apply(as_millions)
df_info['ML Model Income ($)'] = df_info['ML Model Income ($)'].apply(as_millions)
df_info
| percentage customers | No. Customers | Investment ($) | Random Calls Income ($) | Random Model Recall | ML Model Income($) | ML Model Recall | ML Model Income ($) | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.05 | 6350.0 | 12700.0 | $ 4.12M | 0.05 | 14.29 | 0.19 | $ 14.29M |
| 1 | 0.10 | 12700.0 | 25400.0 | $ 8.25M | 0.10 | 26.25 | 0.36 | $ 26.25M |
| 2 | 0.20 | 25400.0 | 50800.0 | $ 16.5M | 0.20 | 46.91 | 0.64 | $ 46.91M |
| 3 | 0.30 | 38100.0 | 76200.0 | $ 24.74M | 0.30 | 62.55 | 0.85 | $ 62.55M |
| 4 | 0.40 | 50800.0 | 101600.0 | $ 32.99M | 0.40 | 71.07 | 0.97 | $ 71.07M |
| 5 | 0.50 | 63500.0 | 127000.0 | $ 41.24M | 0.50 | 73.45 | 1.00 | $ 73.45M |
What percentage of the customers interested in car insurance will the sales team be able to reach if they make 20,000 calls?
# Recall among the 20000 highest-scored customers (the campaign's call budget).
print(np.round(score_top_k(y_test, predict_proba, 20000)['recall'].item(),2))
0.52
What percentage of the customers interested in car insurance will the sales team be able to reach if they make 40,000 calls?
# Recall if the call budget were doubled to 40000 calls.
print(np.round(score_top_k(y_test, predict_proba, 40000)['recall'].item(),2))
0.88
How many phone calls does the sales team have to make to reach 80% of the interested customers?
# Smallest list size (found by inspection) whose recall rounds to 80%.
print("35000 calls:", np.round(score_top_k(y_test, predict_proba, 35000)['recall'].item(),1) * 100, "% of the customers.")
35000 calls: 80.0 % of the customers.
# Final training: reproduce the winning preprocessing on the full dataset,
# persist the encoders for the production API, and fit/export the model.
X = insurance_data4.drop('response', axis = 1)
y = insurance_data4['response']
######## Preprocessing ##########
# Vehicle damage: binary flag ('No' -> 0, anything else -> 1)
X['vehicle_damage'] = X['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
# region_code: target encoding (mean response per region), saved for the API
aux = X.copy()
aux['response'] = y
target_encoding_region_code = aux.groupby('region_code')['response'].mean()
target_encoding_region_code.to_csv('../src/parameter/target_encoding_region_code.csv', index = True )
X['region_code'] = X['region_code'].map( target_encoding_region_code )
# Unseen categories map to NaN -> fall back to 0 (plain assignment instead of
# chained fillna(inplace=True), which modern pandas warns about).
X['region_code'] = X['region_code'].fillna(0)
# policy sales channel: frequency encoding, saved for the API.
# Divide by len(aux) — the frame the counts come from — rather than the
# unrelated insurance_data5 frame the original divided by.
frequency_encoding_policy_sales = aux.groupby('policy_sales_channel')['response'].count() / len(aux)
frequency_encoding_policy_sales.to_csv('../src/parameter/frequency_encoding_policy_sales.csv', index = True )
X['policy_sales_channel'] = X['policy_sales_channel'].map( frequency_encoding_policy_sales )
X['policy_sales_channel'] = X['policy_sales_channel'].fillna(0)
# Feature selection: the features kept after the importance analysis
X = X[['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']]
######## Create ML Model ##########
# Tuned hyper-parameters (from the Optuna study)
params = {'max_depth': 9,
          'learning_rate': 0.014331889541297329,
          'n_estimators': 313,
          'min_child_weight': 5,
          'scale_pos_weight': 8,
          'gamma': 2.3488317973981494e-06,
          'subsample': 0.07868170019741563,
          'colsample_bytree': 0.8875847384272901,
          'reg_alpha': 0.00019823852240508522,
          'reg_lambda': 1.2406673503434204e-07}
# Train model on the full dataset
xgb_model = XGBClassifier(**params).fit(X, y)
# Save model pkl for the production API
pickle.dump( xgb_model, open( '../src/model/xgb_model.pkl', 'wb' ) )
import pandas as pd
class HealthInsurance:
def __init__(self):
    # Filesystem locations of the categorical encoders exported at training time.
    self.region_code_path = '../src/parameter/target_encoding_region_code.csv'
    self.policy_sales_path = '../src/parameter/frequency_encoding_policy_sales.csv'
    # Scalers fitted during training (MinMax for age/vintage, Standard for
    # annual_premium), restored from pickle.
    self.age_mms = pickle.load(open('../src/parameter/mms_age.pkl', 'rb'))
    self.vintage_mms = pickle.load(open('../src/parameter/mms_vintage.pkl','rb'))
    self.annual_premium_ss = pickle.load(open('../src/parameter/ss_annual_premium.pkl', 'rb'))
def preprocessing(self):
# Age
X['age'] = self.age_mms.transform(X[['age']].values )
# Vintage
X['vintage'] = self.vintage_mms.transform(X[['vintage']].values )
# Annual premium
X['annual_premium'] = self.annual_premium_ss.transform(X[['annual_premium']].values )
# Vehicle damage
X = self.copy()
X['vehicle_damage'] = X['vehicle_damage'].apply(lambda x: 0 if x == 'No' else 1)
# region_code
target_encoding_region_code = pd.read_csv(self.region_code_path).set_index('region_code')['response']
X['region_code'] = X['region_code'].map( target_encoding_region_code )
X['region_code'].fillna(0, inplace = True)
# policy sales channel
frequency_encoding_policy_sales = pd.read_csv(self.policy_sales_path).set_index('policy_sales_channel')['response']
X['policy_sales_channel'] = X['policy_sales_channel'].map( frequency_encoding_policy_sales )
X['policy_sales_channel'].fillna(0, inplace = True)
# Feature selection
X = X[['age', 'region_code', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage', 'vehicle_damage']]
def get_prediction( self, model, original_data, preocessed_data):
predictions = model.predict_proba(processed_data)[::,1]
# join predictions into the original data
original_data['predictions'] = predictions
original_data.sort_values( 'predictions', ascending = False )